library(stringr)
## Warning: package 'stringr' was built under R version 3.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
library(choroplethr)
## Warning: package 'acs' was built under R version 3.5.2
library(choroplethrMaps)
library(leaflet)
library(ggmap)
## Warning: package 'ggplot2' was built under R version 3.5.2
library(ggplot2)
library(readr)
library(RColorBrewer)
library(tm)
library(wordcloud)
library(ggthemes)
library(reshape2)

Data Cleaning and Transformation

This section examines immunization surveys collected at the school level in New York State, beginning with the 2012-2013 school year.

First, we examine the trends in exemption rates over time and look at immunization rates at the County level.

df <- read.csv("/Users/sanjnashenoy/Github/Group_A_Health/School_Immunization_Survey__Beginning_2012-13_School_Year.csv")
df$year <- substr(df$Report.Period, 6, 9)
df$County <- tolower(df$County)
df <- df %>% 
  select(School.ID, Type, School.Name, Percent.Medical.Exemptions, Percent.Religious.Exemptions, Percent.Immunized.Measles, Street, City, County, Zip.Code, Location, year)
df$lat <- as.character(df$Location) %>% strsplit("\n") %>% lapply(., "[", 3) %>% unlist() %>% gsub("[()]", "", .) %>% strsplit(., ",") %>% lapply(., "[", 1) %>% unlist() %>% as.character() %>% as.numeric()
df$lon <- as.character(df$Location) %>% strsplit("\n") %>% lapply(., "[", 3) %>% unlist() %>% gsub("[()]", "", .) %>% strsplit(., ",") %>% lapply(., "[", 2) %>% unlist() %>% as.numeric() 

Average Immunization Rates by Counties

risky <- df %>% filter(year == 2018) %>% filter(Percent.Immunized.Measles < 94) %>% arrange(Percent.Immunized.Measles)
## Joining, by = "region"
county_rates$region <- as.numeric(county_rates$region)
choroplethr::county_choropleth(county_rates, title = "Average Immunization Rates by County",num_colors = 7, state_zoom = "new york", legend = "Immunization Rates")

We identify the counties that have immunization rates less than 94. There are 15 counties. The bar chart shows a list of counties that are at risk of measles contagion.

county_rates <- df %>% filter(year == 2018) %>% group_by(County) %>% summarize(avg_immun = mean(Percent.Immunized.Measles)) %>% filter(avg_immun < 94)
ggplot(county_rates, aes(x=reorder(County, -avg_immun), y = avg_immun, fill = County)) + geom_bar(stat="identity") + coord_flip() + theme_few() + ggtitle("Average Immunization Rates by New York Counties (2018)") + theme(legend.position = "none") + xlab("County") + ylab("Average Immunization Rate") 

The top 5 counties are Montgomery,Yates, Allegany, Cattaraugus, and Seneca.

pal = colorFactor("Set1", domain = risky$Type)
color_type = pal(risky$Type)
content <- paste("School Name:",risky$School.Name,"<br/>",
                 "% Immunized:",risky$Percent.Immunized.Measles,"<br/>",
                 "% Religious Exemption",risky$Percent.Religious.Exemptions,"<br/>",
                 "% Medical Exemption",risky$Percent.Medical.Exemptions,"<br/>")

The interactive map below shows the schools in New York that have immunization rates below the ideal threshold needed to achieve herd immunity. We have identified 574 schools as being risky

m <- leaflet(risky) %>% 
  addTiles() %>%
  addCircles(lat= ~lat, lng= ~lon, color = color_type, popup = content) %>% addLegend(pal = pal, values = ~risky$Type, title = "School Type")
## Warning in validateCoords(lng, lat, funcName): Data contains 15 rows with
## either missing or invalid lat/lon values and will be ignored
m  

Type of Risky Schools

ggplot(risky, aes(x=Type, fill = Type)) + geom_histogram(stat="count") + ggtitle("Type of Schools")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

The most common type of school in the risky dataset is private schools.

Text Analysis of Risky Schools

schools <- unique(risky$School.Name)
txt <- schools
frequentSchool = c("school","preschool","academy","elementary","sch", "institute", "center","elem")

myCorpus<-VCorpus(VectorSource(txt))
myCorpusClean <- myCorpus %>% 
  tm_map(content_transformer(tolower)) %>% 
  tm_map(content_transformer(removeNumbers)) %>% 
  tm_map(content_transformer(removePunctuation)) %>%
  tm_map(content_transformer(removeWords),frequentSchool) %>%
  tm_map(content_transformer(removeWords),tidytext::stop_words$word)

tdm_1<- TermDocumentMatrix(myCorpusClean, control = list(minWordLength = 3))
m_tdm_1 <-as.matrix(tdm_1)
word.freq.1<-sort(rowSums(m_tdm_1), decreasing=T)
head(word.freq.1, 5L)
##  christian    yeshiva montessori     valley       bais 
##         43         37         26         17         15

Christian is the most frequent term.

set.seed(12345)
wordcloud(words=names(word.freq.1),freq = word.freq.1,random.order=F,colors=brewer.pal(9,"Set1"),max.words=100)
title(paste0('Most frequent 1-grams in school names'),col.main='black',cex.main=2)

The most frequent words associated with schools that have low immunization rates is words like “christian”, “yeshiva”, and “montessori”. This refers to relgious institituions and alternatuve learning institutions.

Text Analysis of Riskiest Schools

riskiest <- risky %>% filter(Percent.Immunized.Measles == 0)
dim(riskiest)
## [1] 91 14

There are 91 schools where no child is vaccinated against measles.

schools <- unique(riskiest$School.Name)
txt <- schools
myCorpus<-VCorpus(VectorSource(txt))

myCorpusClean <- myCorpus %>% 
  tm_map(content_transformer(tolower)) %>% 
  tm_map(content_transformer(removeNumbers)) %>% 
  tm_map(content_transformer(removePunctuation)) %>%
  tm_map(content_transformer(removeWords),frequentSchool) %>%
  tm_map(content_transformer(removeWords),tidytext::stop_words$word)

tdm_1<- TermDocumentMatrix(myCorpusClean, control = list(minWordLength = 3))
m_tdm_1 <-as.matrix(tdm_1)
word.freq.1<-sort(rowSums(m_tdm_1), decreasing=T)

BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
tdm_2<- TermDocumentMatrix(myCorpusClean, control = list(tokenize = BigramTokenizer))
m_tdm_2 <-as.matrix(tdm_2)
word.freq.2<-sort(rowSums(m_tdm_2), decreasing=T)

set.seed(314)
wordcloud(words=names(word.freq.2),freq = word.freq.2,random.order=F,colors=brewer.pal(9,"Set1"),max.words=100)
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : childrens centerfingerlake could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : congregation mesifta could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : deerfield amish could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : dredge road could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : dygert road could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : echo valley could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : elm grove could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : glen meadows could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : glover hill could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : gravel run could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : grover lane could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : hill valley could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : hillside borden could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : hillside childrens could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : hope menonite could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : island baptist could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : jennings creek could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : jersey acres could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : maple row could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : meadow brook could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : meadow view could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : merchant hill could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : mill creek could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : mosher hollow could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : mud lake could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : pine creek could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : pine view could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : pleasant view could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : rainbow valley could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : ridge amish could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : river view could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : road schoo could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : rock island could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : rockly ridge could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : rocky ridge could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : rolling acres could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : round barn could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : rustic hollow could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : seager hill could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : shady maple could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : sister clara could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : snow ridge could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : spraque hill could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : sunny meadow could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : town line could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : twin maples could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : upward christian could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : west branch could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : willow grove could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(word.freq.2), freq = word.freq.2,
## random.order = F, : windy ridge could not be fit on page. It will not be
## plotted.
title(paste0('Most frequent 2-grams in school names'),col.main='black',cex.main=2)

Again, we see references to religious instituions like “muhammed”, “menonite”, and “amish”.

Focusing on New York City

df_nyc <- df %>% filter(year ==2018) %>% filter(City %in% c("BROOKLYN", "BRONX", "QUEENS", "STATEN ISLAND", "MANHATTAN", "NEW YORK")) %>% filter(Percent.Immunized.Measles < 94) %>% arrange(Percent.Immunized.Measles)
pal = colorFactor("Set1", domain = df_nyc$Type)
color_type = pal(df_nyc$Type)
content <- paste("School Name:",df_nyc$School.Name,"<br/>",
                 "% Immunized:",df_nyc$Percent.Immunized.Measles,"<br/>",
                 "% Religious Exemption",df_nyc$Percent.Religious.Exemptions,"<br/>",
                 "% Medical Exemption",df_nyc$Percent.Medical.Exemptions,"<br/>")
m2 <- leaflet(df_nyc) %>% addTiles() %>% addCircles(lng = ~lon, lat = ~lat, color = color_type, popup = content) %>% addLegend(pal = pal, values = ~df_nyc$Type, title = "School Type")
m2

There are 109 schools in New York City risky category. Every school below the 94% immunization threshold in NYC is a private school. Let’s look at them on a static plot.

newyork <- get_map("new york city", zoom=11)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=new%20york%20city&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=new+york+city&key=xxx
ggmap(newyork) + geom_point(data=df_nyc, aes(x=lon, y=lat))
## Warning: Removed 5 rows containing missing values (geom_point).

Let’s look at the medical exemption rate and the religious exemption rates

df_nyc$City <- factor(as.character(df_nyc$City))
ggplot(df_nyc, aes(x = City, y = Percent.Medical.Exemptions)) +
        geom_boxplot() + theme_few() + xlab("Percent Medical Exemptions") + ylab("City") + ggtitle("Medical Exemption Rates by City")

ggplot(df_nyc, aes(x = City, y = Percent.Religious.Exemptions)) +
        geom_boxplot() + theme_few() + xlab("Percent Religious Exemptions") + ylab("City") + ggtitle("Religious Exemption Rates by City")

Examining 2019 Measles Mandate

Under the NYC Measles Mandate, four zipcodes will be affected: 11205, 11206, 11211 and 11249.

mandate <- df %>% filter(year ==2018) %>% filter(Zip.Code %in% c(11205, 11206, 11211, 11249))
dim(mandate)
## [1] 44 14

This covers only 44 schools, whereas we have identifed 109 schools in New York City that are at risk. Let’s perform a text analysis on the school names to see what patterns emerge.

schools <- unique(mandate$School.Name)
txt <- schools
myCorpus<-VCorpus(VectorSource(txt))

myCorpusClean <- myCorpus %>% 
  tm_map(content_transformer(tolower)) %>% 
  tm_map(content_transformer(removeNumbers)) %>% 
  tm_map(content_transformer(removePunctuation)) %>%
 tm_map(content_transformer(removeWords),frequentSchool) %>%
  tm_map(content_transformer(removeWords),tidytext::stop_words$word)

tdm_1<- TermDocumentMatrix(myCorpusClean, control = list(minWordLength = 1))
m_tdm_1 <-as.matrix(tdm_1)
word.freq.1<-sort(rowSums(m_tdm_1), decreasing=T)
set.seed(12345)
wordcloud(words=names(word.freq.1),freq = word.freq.1,random.order=F,colors=brewer.pal(9,"Set1"),max.words=100)
title(paste0('Most frequent 1-grams in school names'),col.main='black',cex.main=2)

This is very interesting because we previously saw that christian was the most frequent word in the risky dataset.

ggplot(mandate, aes(x= reorder(School.Name, Percent.Immunized.Measles), y = Percent.Immunized.Measles, fill= School.Name)) + geom_bar(stat="identity") + coord_flip() + xlab("School Name") + ylab("Immunization Rates") + ggtitle("Measles Immunization Rates for Measles Mandate Schools")+ theme_few() + theme(legend.position = "none")

The barchart shows that many of the identified schools are well above the threshold for herd immunity.

above_thresh <- filter(mandate, Percent.Immunized.Measles >= 94)
dim(above_thresh)
## [1] 25 14

In fact, more than half of the schools in the zipcodes identified by the NYC Health Department have vaccination rates that are adequate for herd immunity.

Using Data to improve NYC Policy

Let’s look at average measles immunization rates by zipcodes. We will simply look at the top 5.

zip_nyc <- df %>% filter(year == 2018) %>% filter(City %in% c("BROOKLYN", "BRONX", "QUEENS", "STATEN ISLAND", "MANHATTAN", "NEW YORK")) %>% group_by(Zip.Code) %>% summarize(Avg.Immun.Rate = mean(Percent.Immunized.Measles)) %>% arrange(Avg.Immun.Rate) %>% head(5L)
zip_nyc
## # A tibble: 5 x 2
##   Zip.Code Avg.Immun.Rate
##      <int>          <dbl>
## 1    10026           47.6
## 2    11221           75.1
## 3    11237           75.8
## 4    10001           85.2
## 5    10038           85.7

Only 1 zipcode in the list above, (i.e. 11221), was correctly identified by the NYC Health Department. The zipcode 10026,with an average immunization rate of 47.6% across schools was not identified as a risky region.

Let’s look at which schools fall in the zip codes we identified above

schools <- df %>% filter(year==2018) %>% filter(Zip.Code %in% zip_nyc$Zip.Code)
as.character(schools$School.Name)
##  [1] "ST BRIGID CATHOLIC ACADEMY"                   
##  [2] "SCHOOL FOR YOUNG PERFORMERS"                  
##  [3] "NYSARC, INC. - NYC CHAPTER"                   
##  [4] "ATLAS SCHOOL"                                 
##  [5] "CHARLES CHURN CHRISTIAN ACADEMY"              
##  [6] "AVENUES: THE WORLD SCHOOL"                    
##  [7] "SISTER CLARA MUHAMMED ELEMENTARY SCHOOL"      
##  [8] "HARLEM ACADEMY"                               
##  [9] "HAWTHORNE COUNTRY DAY SCHOOL MANHATTAN CAMPUS"
## [10] "MANHATTAN STAR ACADEMY"                       
## [11] "KESWELL SCHOOL (THE)"                         
## [12] "ST FRANCES CABRINI SCHOOL"                    
## [13] "THE BLUE SCHOOL"                              
## [14] "LEARNING SPRING SCHOOL"

We argue that it mandatory measles vaccination policy must address the 14 schools above in the top 5 zipcodes we identified.